from crawlers.userAgent import useragent
import requests
from lxml import etree
import random
import time
import threading


class IPs(object):
    def __init__(self):
        self.url1 = 'http://www.nimadaili.com/gaoni/%d/'  # proxy site 1; pages run up to 2000, pages 1-350 used here
        self.url2 = 'https://www.89ip.cn/index_%d.html'  # proxy site 2; pages run 1-110
        self.url3 = 'https://www.kuaidaili.com/free/inha/%d/'  # proxy site 3; pages run 1-4000
        self.proxies = []  # every proxy that gets crawled
        self.userfulProxies = []  # proxies that pass validation
        self.url = 'https://www.baidu.com/'  # URL used to test whether a proxy works; Baidu here
        self.userAgent=useragent()

    def checkIps(self, ips):
        while True:
            # list.pop() is atomic, but a len() check races against the other
            # threads, so just pop and stop once the shared list is drained
            try:
                proxies = ips.pop()
            except IndexError:
                break
            headers = {'user-agent': self.userAgent.getUserAgent()}
            # requests only routes through a proxy whose key matches the URL
            # scheme, so http-keyed proxies must be tested against an http URL
            scheme = next(iter(proxies))
            checkUrl = self.url if scheme == 'https' else self.url.replace('https://', 'http://')
            try:
                rsp = requests.get(url=checkUrl, headers=headers, proxies=proxies, timeout=0.5)  # short timeout
                if rsp.status_code == 200:
                    self.userfulProxies.append(proxies)
                    # print('========IP {} is usable'.format(proxies))  # uncomment when debugging
                    time.sleep(1)  # pause for 1 second

            except Exception:
                pass
                # print('========IP {} is unusable'.format(proxies))  # uncomment when debugging

    def getUserIps(self):  # return the validated, usable proxies
        self.spiderIps()
        ips = self.proxies[:]
        # hundreds of proxies get crawled, so validate them with 10 threads
        threads = []
        print('=====Start testing!')
        for i in range(10):
            thread = threading.Thread(target=self.checkIps, args=(ips,))
            thread.start()
            threads.append(thread)

        for th in threads:
            th.join()

        print('IP test completed!')
        print('(The number of available IPs is: [%d])' % len(self.userfulProxies))
        if self.proxies:  # avoid a ZeroDivisionError when nothing was crawled
            print('IP proxy efficiency is--{:.2f}%'.format((len(self.userfulProxies) / len(self.proxies)) * 100))

        return self.userfulProxies  # hand back the usable proxies
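
    def getRandomProxy(self):
        # Convenience helper (an illustrative addition, not part of the
        # original code): pick one random validated proxy dict, e.g.
        # {'http': 'http://1.2.3.4:8080'}, ready to pass as requests' proxies=.
        return random.choice(self.userfulProxies) if self.userfulProxies else None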

    def spiderIps(self):
        ipss = []  # collects the proxies scraped from all three sites
        userAgent = self.userAgent
        print('===Initialized, ready to start crawling IPs!')

        randomPage1=random.randrange(1,300)

        url1 = self.url1  # scrape proxies from site 1
        print('=' * 52)
        for i in range(randomPage1,randomPage1+10):
            headers={'user-agent':userAgent.getUserAgent(),
                     'Referer': 'http://www.nimadaili.com/',
                     'Host': 'www.nimadaili.com'}
            try:
                response = requests.get(url=url1 % i, headers=headers, timeout=6)
                if response.status_code == 200:
                    print("URL-1==>page-%d succeeded!" % i)
                    html1 = etree.HTML(response.text)
                    ips = html1.xpath("//table[@class='fl-table']/tbody/tr/td[1]/text()")
                    ipsType = html1.xpath("//table[@class='fl-table']/tbody/tr/td[2]/text()")  # protocol label: HTTP / HTTPS
                    # [:4] truncates 'HTTPS' (and 'HTTP,HTTPS') labels to 'HTTP',
                    # so every proxy from this site is registered under http
                    proxies = ["{}://{}".format(ipsType[j][:4].lower(), ips[j]) for j in range(len(ips))]
                    ipss.extend(proxies)
                    time.sleep(3)  # sleep 3 seconds after each page
            except Exception as e:
                print(e)
                print("URL-1==>page-%d failed!" % i)

        url2 = self.url2  # scrape proxies from site 2
        # this site does not say whether each proxy speaks http or https
        randomPage2 = random.randrange(1, 50)
        print('='*52)
        for i in range(randomPage2,randomPage2+10):
            headers={'user-agent':userAgent.getUserAgent()}
            try:
                response2 = requests.get(url=url2 % i, headers=headers, timeout=5)
                if response2.status_code == 200:
                    print("URL-2==>page-%d succeeded!" % i)
                    HTML1 = etree.HTML(response2.text)
                    ips1 = HTML1.xpath("//table[@class='layui-table']/tbody/tr/td[1]/text()")
                    ports1 = HTML1.xpath("//table[@class='layui-table']/tbody/tr/td[2]/text()")
                    n = len(ips1)
                    # strip the whitespace padding lxml leaves around each table cell
                    ips1 = ["{}:{}".format(ips1[j].strip(), ports1[j].strip()) for j in range(n)]

                    ips11 = []
                    for ip1 in ips1:
                        # the protocol is unknown, so record each address as both
                        ips11.append('http://{}'.format(ip1))
                        ips11.append('https://{}'.format(ip1))
                    ipss.extend(ips11)
                    time.sleep(2)
            except Exception:
                print("URL-2==>page-%d failed!" % i)

        randomPage3 = random.randrange(1, 3000)
        url3 = self.url3  # scrape proxies from site 3
        print('=' * 52)
        for i in range(randomPage3, randomPage3 + 10):
            headers = {'user-agent': userAgent.getUserAgent()}
            try:
                response3 = requests.get(url=url3 % i, headers=headers, timeout=5)
                if response3.status_code == 200:
                    print("URL-3==>page-%d succeeded!" % i)
                    HTML2 = etree.HTML(response3.text)
                    ips2 = HTML2.xpath("//table[@class='table table-bordered table-striped']/tbody/tr/td[1]/text()")
                    ports2 = HTML2.xpath("//table[@class='table table-bordered table-striped']/tbody/tr/td[2]/text()")
                    ipsType = HTML2.xpath("//table[@class='table table-bordered table-striped']/tbody/tr/td[4]/text()")  # protocol column: HTTP / HTTPS
                    n2 = len(ips2)
                    ips2 = ["{}://{}:{}".format(ipsType[j].lower(), ips2[j], ports2[j]) for j in range(n2)]
                    ipss.extend(ips2)
                    time.sleep(2)
            except Exception:
                print("URL-3==>page-%d failed!" % i)
        # a set drops the duplicate proxies in one step
        ipss = list(set(ipss))
        print('The total number of IPs crawled is {}'.format(len(ipss)))
        # build requests-style proxy dicts, e.g. {'http': 'http://1.2.3.4:8080'}
        proxiess = [{ip[:ip.find('://')]: ip} for ip in ipss]
        self.proxies = proxiess
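

# Example usage (an illustrative sketch; it assumes the crawlers.userAgent
# module imported above is available on the path). Crawl and validate
# proxies, then route one request through a randomly chosen proxy.
if __name__ == '__main__':
    ipPool = IPs()
    if ipPool.getUserIps():
        proxy = ipPool.getRandomProxy()  # e.g. {'http': 'http://1.2.3.4:8080'}
        headers = {'user-agent': ipPool.userAgent.getUserAgent()}
        rsp = requests.get('https://www.baidu.com/', headers=headers,
                           proxies=proxy, timeout=5)
        print(proxy, rsp.status_code)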